This IPython notebook is part of the project by Hongyi Tang and Weijian Li for course 12-752. The project contains four notebooks in total, and each notebook covers one cluster analysis task. In this notebook, the cluster analysis is applied to five building types.
In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
%matplotlib inline
Please download the dataset and change the file path.
In [2]:
# Read in CBECS data.
# Local path to the downloaded CBECS.csv; change it to match your machine.
cbecs_path = 'C:/F16-12-752-master/projects/thongyi_weijian1/data/CBECS.csv'
# The context manager closes the file handle as soon as the CSV is parsed
# (the bare open() used before left it open for the life of the kernel).
with open(cbecs_path) as f:
    data = pd.read_csv(f, sep=',', header='infer', parse_dates=[1])
# Index the table by PUBID so later joins can align on it.
data = data.set_index('PUBID')
data.tail()
Out[2]:
Office, inpatient health care, service, public assembly and education buildings are selected.
In [74]:
# PBA codes kept for the analysis:
# office, inpatient health care, service, public assembly and education
type_B = [2, 16, 26, 13, 14]
# PBA codes that are filtered out
type_C = [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 91]
# Energy columns that get divided by the floor area (SQFT)
index = ['ELBTU', 'NGBTU', 'ELVNBTU', 'NGHTBTU']

# Keep rows whose NGUSED is not 2 and whose PBA is not one of the excluded codes.
# NOTE(review): NGUSED == 2 presumably marks buildings without natural gas -
# confirm against the CBECS codebook.
keep_ngused = data.NGUSED != 2
excluded_pba = data.PBA.isin(type_C)
data_type = data[keep_ngused & ~excluded_pba]

# Per-square-foot values, with the building-type code as the first column.
energydata = data_type[index].div(data_type.SQFT, axis=0)
energydata.insert(0, 'Building Type', data_type.PBA)
energydata
Out[74]:
In [75]:
# Keep only rows that have no missing value and no zero entry in any column.
complete_rows = energydata.dropna(how='any')
has_zero = (complete_rows == 0).any(axis=1)
energydata = complete_rows[~has_zero]

# Building-type codes still present after the filtering.
PBA1 = energydata['Building Type'].unique()

# One [number of rows, building-type code] pair per remaining building type.
building_type = energydata['Building Type']
count = [[energydata[building_type == code].shape[0], code] for code in PBA1]
count
Out[75]:
In [91]:
# Display names, listed in the same order as the codes in type_B.
type_C = ['Office', 'Inpatient health care', 'Service', 'Public assembly', 'Education']
fig1 = plt.figure(figsize=(20, 15))

# One frame per selected building type, with the first column dropped so that
# only the remaining columns are plotted.
data_seperate = []
for code in type_B:
    of_type = energydata[energydata['Building Type'] == code]
    data_seperate.append(of_type.drop(of_type.columns[0], axis=1))

# One boxplot panel per building type on a (len(type_B) x 2) grid.
n_rows = len(type_B)
for times, (label, frame) in enumerate(zip(type_C, data_seperate), start=1):
    plt.subplot(n_rows, 2, times)
    frame.boxplot()
    plt.title(label)
    plt.ylim(0, 200)
In [80]:
from sklearn.cluster import KMeans

# Stack the per-building-type frames into one table.
# pd.concat replaces the row-by-row DataFrame.append loop: append copies the
# whole frame on every call and is no longer available in recent pandas.
y = pd.concat(data_seperate)
# .values replaces DataFrame.as_matrix(), which recent pandas no longer provides.
X = y.values.astype(np.float32)

num_clust = 5
# A fixed random_state makes the cluster numbering repeatable between runs;
# later cells refer to clusters by their numeric label.
clusters = KMeans(n_clusters=num_clust, random_state=0).fit(X)
cluster_assignments = clusters.predict(X)

# One panel per cluster: every member as a thin grey line, the centroid in black.
fig2 = plt.figure(figsize=(20, 15))
for cluster_id in range(len(clusters.cluster_centers_)):
    plt.subplot(num_clust, 2, cluster_id + 1)
    cluster_members = X[cluster_assignments == cluster_id, :]
    print(len(cluster_members))
    # Transposing lets a single call draw one line per member row.
    plt.plot(cluster_members.T, color='grey', lw=0.1)
    plt.plot(clusters.cluster_centers_[cluster_id, :], color='k', lw=1)
In [81]:
# Drop the rows assigned to one cluster and rebuild the feature matrix.
# NOTE(review): label 4 was presumably chosen as the outlier cluster by
# inspecting the plots of one particular run - confirm it against the plots
# above before relying on it.
outlier_cluster = 4
# Filter with the label array directly instead of adding and deleting a
# temporary 'assignment' column; .copy() keeps y an independent frame so that
# later column assignments do not trigger SettingWithCopyWarning.
y = y[cluster_assignments != outlier_cluster].copy()
# .values replaces DataFrame.as_matrix(), which recent pandas no longer provides.
X = y.values.astype(np.float32)
In [82]:
# Re-run K-Means on the reduced feature matrix and plot each cluster.
num_clust = 5
# A fixed random_state makes the cluster numbering repeatable between runs;
# the following cell refers to a cluster by its numeric label.
clusters = KMeans(n_clusters=num_clust, random_state=0).fit(X)
cluster_assignments = clusters.predict(X)

# One panel per cluster: every member as a thin grey line, the centroid in black.
fig2 = plt.figure(figsize=(20, 15))
for cluster_id in range(len(clusters.cluster_centers_)):
    plt.subplot(num_clust + 1, 2, cluster_id + 1)
    cluster_members = X[cluster_assignments == cluster_id, :]
    print(len(cluster_members))
    # Transposing lets a single call draw one line per member row.
    plt.plot(cluster_members.T, color='grey', lw=0.1)
    plt.plot(clusters.cluster_centers_[cluster_id, :], color='k', lw=1)
In [83]:
# Drop the rows assigned to one cluster and rebuild the feature matrix.
# NOTE(review): label 4 was presumably chosen as the outlier cluster by
# inspecting the plots of one particular run - confirm it against the plots
# above before relying on it.
outlier_cluster = 4
# Filter with the label array directly instead of adding and deleting a
# temporary 'assignment' column; .copy() keeps y an independent frame so that
# later column assignments do not trigger SettingWithCopyWarning.
y = y[cluster_assignments != outlier_cluster].copy()
# .values replaces DataFrame.as_matrix(), which recent pandas no longer provides.
X = y.values.astype(np.float32)
In [84]:
# Re-run K-Means on the reduced feature matrix and plot each cluster.
num_clust = 5
# A fixed random_state makes the cluster numbering repeatable between runs;
# the following cell refers to a cluster by its numeric label.
clusters = KMeans(n_clusters=num_clust, random_state=0).fit(X)
cluster_assignments = clusters.predict(X)

# One panel per cluster: every member as a thin grey line, the centroid in black.
fig2 = plt.figure(figsize=(20, 15))
for cluster_id in range(len(clusters.cluster_centers_)):
    plt.subplot(num_clust + 1, 2, cluster_id + 1)
    cluster_members = X[cluster_assignments == cluster_id, :]
    print(len(cluster_members))
    # Transposing lets a single call draw one line per member row.
    plt.plot(cluster_members.T, color='grey', lw=0.1)
    plt.plot(clusters.cluster_centers_[cluster_id, :], color='k', lw=1)
In [85]:
# Drop the rows assigned to one cluster and rebuild the feature matrix.
# NOTE(review): label 2 was presumably chosen as the outlier cluster by
# inspecting the plots of one particular run - confirm it against the plots
# above before relying on it.
outlier_cluster = 2
# Filter with the label array directly instead of adding and deleting a
# temporary 'assignment' column; .copy() keeps y an independent frame so that
# later column assignments do not trigger SettingWithCopyWarning.
y = y[cluster_assignments != outlier_cluster].copy()
# .values replaces DataFrame.as_matrix(), which recent pandas no longer provides.
X = y.values.astype(np.float32)
In [86]:
# Final K-Means run on the reduced feature matrix, with one plot per cluster.
num_clust = 5
# A fixed random_state makes the cluster numbering repeatable between runs;
# the comparison against building types below refers to clusters by label.
clusters = KMeans(n_clusters=num_clust, random_state=0).fit(X)
cluster_assignments = clusters.predict(X)

# One panel per cluster: every member as a thin grey line, the centroid in black.
fig2 = plt.figure(figsize=(20, 15))
for cluster_id in range(len(clusters.cluster_centers_)):
    plt.subplot(num_clust + 1, 2, cluster_id + 1)
    cluster_members = X[cluster_assignments == cluster_id, :]
    print(len(cluster_members))
    # Transposing lets a single call draw one line per member row.
    plt.plot(cluster_members.T, color='grey', lw=0.1)
    plt.plot(clusters.cluster_centers_[cluster_id, :], color='k', lw=1)
The mapping between cluster assignments and building types has to be determined by visually inspecting the cluster plots. Because K-Means numbers its clusters arbitrarily, this mapping can change every time the notebook is run.
In [87]:
# Attach the K-Means label and the building-type code (PBA) to every row.
y['assignment'] = cluster_assignments
y = y.join(data['PBA'], how='inner')

# Cluster label expected for each PBA code.
# NOTE(review): this mapping was presumably read off the cluster plots of one
# particular run - confirm it against the current plots.
pba_to_cluster = {2: 0, 16: 3, 26: 1, 13: 4, 14: 2}
# A single vectorised map replaces the chained `y['judge'].iloc[...] = v`
# assignments, which write through a temporary object and can silently have
# no effect. Codes missing from the mapping keep the previous default of 1.
y['judge'] = y['PBA'].map(pba_to_cluster).fillna(1).astype(int)

# Per-column count of rows whose expected label equals the K-Means label.
y[y['judge'] == y['assignment']].count()
Out[87]:
In [88]:
# Ratio of a hand-entered count to the sum of five hand-entered group sizes.
# NOTE(review): 727 looks like the match count printed by the previous cell and
# the five addends look like per-building-type row counts from one particular
# run - confirm, and re-enter them whenever the clustering is re-run.
numerator = 727
denominator_terms = [714, 283, 244, 226, 471]
# float() forces true division even on a Python 2 kernel, where int / int
# would truncate this ratio to 0.
a = numerator / float(sum(denominator_terms))
In [89]:
# Display the ratio computed in the previous cell.
a
Out[89]:
In [ ]: